###Figure 9: replicate Figures 5-8 with additional control for PID


library(foreign)
library(readstata13)
library(dplyr)

setwd("C:/Users/kevin/Dropbox/Social_Media_Lab/Data_Survey/YouGov_Data/apsr_replication/")
load("merged_nr_soma.RData")



## Recode the Twitter frequency use variable to better reflect the underlying categories
## Roughly, how many times per month do you go on Twitter?
data$twitter_freq_inc<-as.numeric(
  revalue(as.character(data$twitter_freq_inc) , 
          c("7"= "90", "6"="30", "5" = " 15", "4" = "7", "3"="2", 
            "2"="1", "1"="0")))

###need the all_tweets variable un-logged, and to avoid dividing by zero

data$tweets_all_topics<-(1.0001 +data$tweets_all_topics)

data$all_tweets_log<-1/log(data$tweets_all_topics)

#########################################################################
##Loop over all models
#########################################################################



controls <- c("woman", "age", "lowerclass", "profile_education_age", 
              "white_british", "married", "newsnight_freq", "religious",
              "internet_freq_inc",  "newspaper_type", "pid" )




full_parties <- c("Labour", "UKIP", "LibDem", "Tories")

parties <- c("labour", "UKIP", "libdem", "conserv")
issues <- c("EU", "spending", "immigration")
waves <- c("w1", "w2", "w3", "w4")
t_parties <- c("labo", "ukip", "lide", "tory")

##need to change t_issues to accomodate isis

qs <- c("isis","unemployment",  "immigrants")
t_issues <- c("isis", "economy", "immigr")
t_waves <- c("p1", "p2", "p3", "p4")
pid <- c(2, 5, 3, 1)
tt_issues <- c("isis", "econ", "imm")




##
names<-list()

models_combo=list()
models_lm=list()
models_full=list()
models_full_raw=list()


labo_means=list()
labo_ses=list()
ukip_means=list()
ukip_ses=list()
lide_means=list()
lide_ses=list()
tory_means=list()
tory_ses=list()
right_media_means=list()
right_media_ses=list()
cent_media_means=list()
cent_media_ses=list()
left_media_means=list()
left_media_ses=list()




## Here the outcome is whether or not they were correct in wave 3,
## the baseline is whether or not they were correct in wave 2
## We go from Wave 2 to Wave 3 because these are the only waves we asked these questions

for (j in 1:length(qs)){

  data$outcome<-abs(eval(parse(text=paste0("data$fact_" , qs[j], 
                                           "_w3_correct_dk0" ))))
  
  data$baseline<-abs(eval(parse(text=paste0("data$fact_" , qs[j], 
                                            "_w2_correct_dk0"))))
  
  ##Look at the relevant tweets from each party in the 3rd time period
  data$tweet_count_labo<-  (
    eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_labo"))) +
      eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_labo")))
  )
  
  data$tweet_count_lide<-  (
    eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_lide"))) +
      eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_lide")))
  )
  
  data$tweet_count_ukip<-  (
    eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_ukip"))) +
      eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_ukip")))
  )
  
  data$tweet_count_tory<-  (
    eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_tory"))) +
      eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_tory")))
  )
  
  
  
  
  data$right_media<-{ 
    + eval(parse(text=paste0("data$p3_media_right_", t_issues[j])))
  }
  
  data$right_media_log<-log(1.00001 + data$right_media)
  
  data$left_media<-{ 
    + eval(parse(text=paste0("data$p3_media_left_", t_issues[j])))
  }
  
  data$left_media_log<-log(1.00001 + data$left_media)
  
  
  data$cent_media<-{ 
    + eval(parse(text=paste0("data$p3_media_cent_", t_issues[j])))
  }
  
  data$cent_media_log<-log(1.00001 + data$cent_media)
  
  
  
  ##create aggregate variables 
  data$parties<-log(1.0001+ 
                      (data$tweet_count_lide) + (data$tweet_count_labo) +
                      (data$tweet_count_ukip) + (data$tweet_count_tory))
  
  data$media<-log(1.0001+data$cent_media + data$left_media + data$right_media)
  
  data$total<-log(1.0001+ 
                    (data$tweet_count_lide) + (data$tweet_count_labo) +
                    (data$tweet_count_ukip) + (data$tweet_count_tory) +
                    data$cent_media + data$left_media + data$right_media)
  
 
  
  ##log the main variables 
  
  data$tweet_count_tory<-log(1.00001 + data$tweet_count_tory)
  
  data$tweet_count_labo<-log(1.00001 + data$tweet_count_labo)
  
  data$tweet_count_lide<-log(1.00001 + data$tweet_count_lide)
  
  data$tweet_count_ukip<-log(1.00001 + data$tweet_count_ukip)
  
  
  ##various model specifications
  
  #full
  models_full[[j]]<-summary(glm(paste0("outcome ~ baseline + total + twitter_freq_inc  +"
                                       , paste0(controls, collapse="+")), data=data, na.action=na.exclude, family="binomial"))
  
  models_full_raw[[j]]<-(glm(paste0("outcome ~ baseline + total + twitter_freq_inc  +"
                                    , paste0(controls, collapse="+")), data=data, na.action=na.exclude, family="binomial"))
  
  #combo
  models_combo[[j]]<-summary(glm(paste0("outcome ~ baseline + media + parties + twitter_freq_inc  +"
                                           , paste0(controls, collapse="+")), data=data, na.action=na.exclude, family="binomial"))
  
  
  
  #no interactions
  
  models_lm[[j]]<-summary(glm(paste0("outcome ~ baseline + tweet_count_labo + tweet_count_ukip + tweet_count_lide + tweet_count_tory +  
                                     right_media_log + left_media_log + cent_media_log+ twitter_freq_inc  + "
                                     , paste0(controls, collapse="+")), data=data, na.action=na.exclude, family="binomial"))
  
  
}

################graphing--all tweets

full_qs<-c("ISIS", "Unemployment", "Immigration")


names<-c("ISIS", "Unemployment", "Immigration")
index<-c(1, 2, 3)
##extract mean values 
means<-c(models_full[[1]]$coefficients["total","Estimate"],
         models_full[[2]]$coefficients["total","Estimate"],
         models_full[[3]]$coefficients["total","Estimate"]
         
         
)

##extract standard errors values 
ses<-c(models_full[[1]]$coefficients["total","Std. Error"],
       models_full[[2]]$coefficients["total","Std. Error"],
       models_full[[3]]$coefficients["total","Std. Error"]
)


means<-unlist(means)
ses<-unlist(ses)

##plot 
pdf("results/all_w2w3_fact_correct_total_pid.pdf", 8,4)
par(mar=c(4, 8, 4, 2)  )
plot( means, index, xlim=c(-.6, .6), ylab="", yaxt="n", xlab="Logistic regression coefficients on number of relevant tweets",
 #     main = paste0("Wave 2--Wave 3 Effects of Tweets on Issue-Specific Accuracy"), pch=3)
main = "Replication of Figure 5", pch=3)


axis(2, at=index, labels =paste0(names), las=2)
abline(v=0)
points(means + 1.64*ses, index, pch="|")
points(means - 1.64*ses, index, pch="|")
for(i in 1:length(means)){
  lines(c(means[i] - 1.96*ses[i],means[i] + 1.96*ses[i]), c(index[i],index[i]))
}

dev.off()



################graphing--combined results

full_qs<-c("ISIS", "Unemployment", "Immigration")

all_means<-vector()
all_ses<-vector()


for(i in 1:length(models_combo)){
  names<-c("Parties", "Media")
  index<-c(.1, .2)
  ##extract mean values 
  means<-c(models_combo[[i]]$coefficients["parties","Estimate"],
           models_combo[[i]]$coefficients["media","Estimate"])
  
  ##extract standard errors values 
  ses<-c(models_combo[[i]]$coefficients["parties","Std. Error"],
         models_combo[[i]]$coefficients["media","Std. Error"])  
  
  
  all_means<-c(all_means,unlist(means))
  all_ses<-c(all_ses,unlist(ses))
  
  
  
}





####################################

#Combine into one graph


##plot 
pdf(paste0("results/all_w2w3_fact_correct_combo_pid.pdf"), 8,4)
par(mar=c(4, 10, 4, 2)  )
index<-seq(.1, .6, length.out = 6)
plot( all_means, index, xlim=c(-.6, .6), ylim=c(.09, .61), ylab="", yaxt="n", xlab="Logistic regression coefficients on number of relevant tweets",
     # main = paste0("Wave 2--Wave 3 Effects of Tweets on Issue-Specific Accuracy"), pch=3)
     main = "Replication of Figure 6", pch=3)


names<- c("ISIS: Parties", "ISIS: Media", "Unemployment: Parties","Unemployment: Media","Immigration: Parties",
          "Immigration: Media"          )
axis(2, at=index, labels =paste0(names), las=2)
abline(v=0)
points(all_means + 1.64*all_ses, index, pch="|")
points(all_means - 1.64*all_ses, index, pch="|")
for(i in 1:length(all_means)){
  lines(c(all_means[i] - 1.96*all_ses[i],all_means[i] + 1.96*all_ses[i]), c(index[i],index[i]))
}

dev.off()




##########################graphing--divided results 

full_qs<-c("ISIS", "Unemployment", "Immigration")

for(i in 1:length(models_lm)){
  
  names<-c(full_parties, "Right media",  "Cent media","Left media")
  index<-seq(7)
  ##extract mean values 
  means<-c(models_lm[[i]]$coefficients["tweet_count_labo","Estimate"],
           models_lm[[i]]$coefficients["tweet_count_ukip","Estimate"],
           models_lm[[i]]$coefficients["tweet_count_lide","Estimate"],
           models_lm[[i]]$coefficients["tweet_count_tory","Estimate"],
           models_lm[[i]]$coefficients["right_media_log","Estimate"],
           models_lm[[i]]$coefficients["cent_media_log","Estimate"],
           models_lm[[i]]$coefficients["left_media_log","Estimate"]
  )
  
  ##extract standard errors values 
  ses<-c(models_lm[[i]]$coefficients["tweet_count_labo","Std. Error"],
         models_lm[[i]]$coefficients["tweet_count_ukip","Std. Error"],
         models_lm[[i]]$coefficients["tweet_count_lide","Std. Error"],
         models_lm[[i]]$coefficients["tweet_count_tory","Std. Error"],
         models_lm[[i]]$coefficients["right_media_log","Std. Error"],
         models_lm[[i]]$coefficients["cent_media_log","Std. Error"],
         models_lm[[i]]$coefficients["left_media_log","Std. Error"]
  )  
  
  
  
  means<-unlist(means)
  ses<-unlist(ses)
  
  ##plot 
  pdf(paste0("results/",full_qs[i],"_w2w3_fact_correct_pid.pdf"), 12,8)
  par(mar=c(4, 8, 4, 2)  )
  plot( means, index,   xlim=c(-.6, .6), ylab="", yaxt="n", xlab="Logistic regression coefficients on number of relevant tweets",
     #   main = paste0("Wave 2--Wave 3 Effects of Tweets on Issue-Specific Accuracy: ", full_qs[i]), pch=3)
  
          main =paste0("Replication of Figures 7 and 8:", full_qs[i]), pch=3)
  axis(2, at=index, labels =paste0(names), las=2)
  abline(v=0)
  points(means + 1.64*ses, index, pch="|")
  points(means - 1.64*ses, index, pch="|")
  for(i in 1:length(means)){
    lines(c(means[i] - 1.96*ses[i],means[i] + 1.96*ses[i]), c(index[i],index[i]))
  }
  
  dev.off()
  
  
}




####Ordinal analysis for Table 7

t_issues=c("isis", "economy", "immigr")

data$fact_immigrants_w2_ordinal<-as.numeric(data$fact_immigrants_w2)
data$fact_immigrants_w2_ordinal[data$fact_immigrants_w2_ordinal==5]<-NA

data$fact_immigrants_w3_ordinal<-as.numeric(data$fact_immigrants_w3)
data$fact_immigrants_w3_ordinal[data$fact_immigrants_w3_ordinal==5]<-NA

data$fact_immigrants_w3_ordinal<-factor(data$fact_immigrants_w3_ordinal , ordered=T)
data$fact_immigrants_w2_ordinal<-factor(data$fact_immigrants_w2_ordinal , ordered=T)


j<-3
data$outcome=eval(parse(text=paste0("data$fact_immigrants_w3_ordinal" )))
data$baseline=eval(parse(text=paste0("data$fact_immigrants_w2_ordinal" )))

##Look at the relevant tweets from each party in the 3rd time period
data$tweet_count_labo<-  (
  eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_labo"))) +
    eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_labo")))
)

data$tweet_count_lide<-  (
  eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_lide"))) +
    eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_lide")))
)

data$tweet_count_ukip<-  (
  eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_ukip"))) +
    eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_ukip")))
)

data$tweet_count_tory<-  (
  eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_tory"))) +
    eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_tory")))
)




data$right_media<-{ 
  + eval(parse(text=paste0("data$p3_media_right_", t_issues[j])))
}

data$right_media_log<-log(1.00001 + data$right_media)

data$left_media<-{ 
  + eval(parse(text=paste0("data$p3_media_left_", t_issues[j])))
}

data$left_media_log<-log(1.00001 + data$left_media)


data$cent_media<-{ 
  + eval(parse(text=paste0("data$p3_media_cent_", t_issues[j])))
}

data$cent_media_log<-log(1.00001 + data$cent_media)





##log the main variables 

data$tweet_count_tory<-log(1.00001 + data$tweet_count_tory)

data$tweet_count_labo<-log(1.00001 + data$tweet_count_labo)

data$tweet_count_lide<-log(1.00001 + data$tweet_count_lide)

data$tweet_count_ukip<-log(1.00001 + data$tweet_count_ukip)

library(MASS)

imm<-(polr(paste0("outcome ~ baseline+tweet_count_labo + tweet_count_ukip + tweet_count_lide + tweet_count_tory +  
                  right_media_log + left_media_log + cent_media_log  +twitter_freq_inc + 
                  
                  "    , paste0(controls, collapse="+")              ), data=data,  method="probit",
           Hess=TRUE))


summary(imm)

## can also do a bit with the unemployment


##gotta arrange the responses so it goes from lower to higher 
data$fact_unemployment_w2_ordinal<-as.numeric(data$fact_unemployment_w2)
data$fact_unemployment_w2_ordinal[data$fact_unemployment_w2_ordinal==1]<-3
data$fact_unemployment_w2_ordinal[data$fact_unemployment_w2_ordinal==2]<-1
data$fact_unemployment_w2_ordinal[data$fact_unemployment_w2_ordinal==3]<-2
data$fact_unemployment_w2_ordinal[data$fact_unemployment_w2_ordinal==4]<-NA


table(as.numeric(data$fact_unemployment_w2))
data$fact_unemployment_w3_ordinal<-as.numeric(data$fact_unemployment_w3)
data$fact_unemployment_w3_ordinal1<-as.numeric(data$fact_unemployment_w3)

data$fact_unemployment_w3_ordinal[data$fact_unemployment_w3_ordinal1==3]<-2
data$fact_unemployment_w3_ordinal[data$fact_unemployment_w3_ordinal1==1]<-3
data$fact_unemployment_w3_ordinal[data$fact_unemployment_w3_ordinal1==2]<-1
data$fact_unemployment_w3_ordinal[data$fact_unemployment_w3_ordinal1==4]<-NA


data$fact_unemployment_w3_ordinal<-factor(data$fact_unemployment_w3_ordinal , ordered=T)
data$fact_unemployment_w2_ordinal<-factor(data$fact_unemployment_w2_ordinal , ordered=T)

##set the issuese
j<-2
data$outcome=eval(parse(text=paste0("data$fact_unemployment_w3_ordinal" )))
data$baseline=eval(parse(text=paste0("data$fact_unemployment_w2_ordinal" )))

##Look at the relevant tweets from each party in the 3rd time period
data$tweet_count_labo<-  (
  eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_labo"))) +
    eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_labo")))
)

data$tweet_count_lide<-  (
  eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_lide"))) +
    eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_lide")))
)

data$tweet_count_ukip<-  (
  eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_ukip"))) +
    eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_ukip")))
)

data$tweet_count_tory<-  (
  eval(parse(text=paste0("data$p3_1", "_", t_issues[j], "_tory"))) +
    eval(parse(text=paste0("data$p3_2", "_", t_issues[j], "_tory")))
)




data$right_media<-{ 
  + eval(parse(text=paste0("data$p3_media_right_", t_issues[j])))
}

data$right_media_log<-log(1.00001 + data$right_media)

data$left_media<-{ 
  + eval(parse(text=paste0("data$p3_media_left_", t_issues[j])))
}

data$left_media_log<-log(1.00001 + data$left_media)


data$cent_media<-{ 
  + eval(parse(text=paste0("data$p3_media_cent_", t_issues[j])))
}

data$cent_media_log<-log(1.00001 + data$cent_media)






##log the main variables 

data$tweet_count_tory<-log(1.00001 + data$tweet_count_tory)

data$tweet_count_labo<-log(1.00001 + data$tweet_count_labo)

data$tweet_count_lide<-log(1.00001 + data$tweet_count_lide)

data$tweet_count_ukip<-log(1.00001 + data$tweet_count_ukip)




##check what's in here
unem<-(polr(paste0("outcome ~ baseline+tweet_count_labo + tweet_count_ukip + tweet_count_lide + tweet_count_tory +  
                   right_media_log + left_media_log + cent_media_log  +twitter_freq_inc + 
                   
                   "    , paste0(controls, collapse="+")              ), data=data,  method="probit",
            Hess=TRUE))



summary(unem)
library(stargazer)
stargazer(unem, imm)


